#!/usr/bin/env python3

"""
Nagios plugin to check the difference between the primary and
secondary Postgres servers' xlog location.
"""
if False:
    # See https://zulip.readthedocs.io/en/latest/testing/mypy.html#mypy-in-production-scripts
    from mypy_extensions import NoReturn

import re
import subprocess
import sys

# Process exit status for each Nagios state name.  report() looks the
# state up here and exits with the corresponding value.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}

def report(state, msg):
    # type: (str, str) -> NoReturn
    """Print a one-line status and terminate the process.

    The line written to stdout has the form ``<state>: <msg>``; the
    process then exits with ``states[state]``.

    Args:
        state: One of the keys of ``states``.
        msg: Human-readable detail to show after the state name.

    Raises:
        SystemExit: Always, carrying the exit status for ``state``.
        KeyError: If ``state`` is not a key of ``states`` (the status
            line has already been printed at that point).
    """
    print("%s: %s" % (state, msg))
    # Use sys.exit() rather than the bare exit(): the latter is added to
    # builtins by the site module for interactive sessions and is not
    # available under `python -S` or in embedded interpreters.
    sys.exit(states[state])

def get_loc_over_ssh(host, func):
    # type: (str, str) -> str
    """Call a no-argument SQL function on a remote host and return psql's output.

    Runs ``psql`` against the ``zulip`` database on ``host`` through
    ``ssh`` and evaluates ``SELECT <func>()``.  stderr is folded into the
    captured output so that it is available for the failure message.

    Args:
        host: Host name handed to ``ssh``.
        func: Name of the SQL function to call; it is interpolated into
            the remote command line as-is.

    Returns:
        The captured output as text, not stripped of whitespace.

    If the command exits with a non-zero status, the failure is reported
    as CRITICAL via ``report()``, which terminates the process.
    """
    remote_command = 'psql -v ON_ERROR_STOP=1 zulip -t -c "SELECT %s()"' % (func,)
    ssh_argv = ['ssh', host, remote_command]
    try:
        output = subprocess.check_output(ssh_argv,
                                         stderr=subprocess.STDOUT,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        report('CRITICAL', 'ssh failed: %s: %s' % (str(e), e.output))
    else:
        return output

def loc_to_abs_offset(loc_str):
    # type: (str) -> int
    """Convert an xlog location string into an absolute byte offset.

    Args:
        loc_str: A location of the form ``<hex>/<hex>`` (xlog file id,
            then offset within that file).  Leading and trailing
            whitespace is ignored.

    Returns:
        ``0xFF000000 * file_id + file_offset`` as an integer.

    Raises:
        ValueError: If ``loc_str`` does not have the expected form.
    """
    parsed = re.match(r'^\s*([0-9a-fA-F]+)/([0-9a-fA-F]+)\s*$', loc_str)
    if parsed is None:
        raise ValueError("Unknown xlog location format: " + loc_str)
    xlog_id, rec_offset = (int(field, 16) for field in parsed.groups())

    # PostgreSQL 9.2's pg_xlog_location_diff computes
    #   XLogFileSize * (xlogid1 - xlogid2) + xrecoff1 - xrecoff2
    # so the absolute offset is that expression with the second location
    # taken as 0/0.
    #
    # Per xlog_internal.h:
    #   #define XLogSegSize ((uint32) XLOG_SEG_SIZE)
    #   #define XLogSegsPerFile (((uint32) 0xffffffff) / XLogSegSize)
    #   #define XLogFileSize (XLogSegsPerFile * XLogSegSize)
    # With the usual 16MB XLOG_SEG_SIZE, XLogFileSize is 0xFF000000.
    #
    # NOTE(review): this stride follows the 9.2 definition quoted above;
    # confirm it matches the server version actually being monitored.
    xlog_file_size = 0xFF000000
    return xlog_file_size * xlog_id + rec_offset

# Fetch the locations in this order to make the differences positive
# in the normal case given the delay in getting the values via ssh
secondary_replay_loc = get_loc_over_ssh('postgres-secondary.zulip.net', 'pg_last_xlog_replay_location')
secondary_recv_loc   = get_loc_over_ssh('postgres-secondary.zulip.net', 'pg_last_xlog_receive_location')
primary_loc          = get_loc_over_ssh('postgres-primary.zulip.net', 'pg_current_xlog_location')

try:
    primary_offset = loc_to_abs_offset(primary_loc)
    secondary_recv_offset = loc_to_abs_offset(secondary_recv_loc)
    secondary_replay_offset = loc_to_abs_offset(secondary_replay_loc)
except ValueError as e:
    # Left uncaught, this would end the script with a traceback and exit
    # status 1, which is the WARNING status and carries no status line.
    # Unparseable output means the lag could not be determined at all.
    report('UNKNOWN', str(e))

# How far the secondary's received xlog trails the primary, and how far
# its replayed xlog trails what it has received, in bytes.
recv_diff = primary_offset - secondary_recv_offset
replay_diff = secondary_recv_offset - secondary_replay_offset

# xlog segments are normally 16MB each.  These thresholds are pretty arbitrary.
XLOG_SEGMENT_BYTES = 16 * 1024**2
WARNING_LAG_BYTES = XLOG_SEGMENT_BYTES
CRITICAL_LAG_BYTES = 5 * XLOG_SEGMENT_BYTES

# The checks run in decreasing order of severity; report() exits, so the
# first one that matches decides the result.
if recv_diff > CRITICAL_LAG_BYTES:
    report('CRITICAL', 'secondary is %d bytes behind on receiving xlog' % (recv_diff,))

if replay_diff > CRITICAL_LAG_BYTES:
    report('CRITICAL', 'secondary is %d bytes behind on applying received xlog' % (replay_diff,))

# A negative difference means the secondary is ahead; negate it so the
# message reads "N bytes ahead" rather than "-N bytes ahead".
if recv_diff < 0:
    report('CRITICAL', 'secondary is %d bytes ahead on receiving xlog' % (-recv_diff,))

if replay_diff < 0:
    report('CRITICAL', 'secondary is %d bytes ahead on applying received xlog' % (-replay_diff,))

if recv_diff > WARNING_LAG_BYTES:
    report('WARNING', 'secondary is %d bytes behind on receiving xlog' % (recv_diff,))

if replay_diff > WARNING_LAG_BYTES:
    report('WARNING', 'secondary is %d bytes behind on applying received xlog' % (replay_diff,))

report('OK', ('secondary is %d bytes behind on receiving and %d bytes behind on applying xlog'
              % (recv_diff, replay_diff)))
